{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"CH03.0X Tokenizer.ipynb","provenance":[],"collapsed_sections":[]},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.6"},"widgets":{"application/vnd.jupyter.widget-state+json":{"818dd77dbc604ecc9325fc498b4601c3":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_4b00a00f16fa4ce3b6fcff6a5bf1bd19","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_cfd5307cafa147bd9954022930e7a844","IPY_MODEL_6e6c17ffa90c4745aee961fb48e3f9e4"]}},"4b00a00f16fa4ce3b6fcff6a5bf1bd19":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}
},"cfd5307cafa147bd9954022930e7a844":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_66ace374eb574853902bc81c61d23f1c","_dom_classes":[],"description":"Downloading: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":385,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":385,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_9b37af413d594ab4adf18ff2f1e539c5"}},"6e6c17ffa90c4745aee961fb48e3f9e4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_2961f7351cf84b3492fff54a4cb0a88a","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 385/385 [00:05&lt;00:00, 
76.2B/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_dce37a91bcdf4fc897df8bb014acda4e"}},"66ace374eb574853902bc81c61d23f1c":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"9b37af413d594ab4adf18ff2f1e539c5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"2961f7351cf84b3492fff54a4cb0a88a":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"dce37a91bcdf4fc897df8bb014acda4
e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"f44e3dab2caf465696cfa4770b8db21b":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_b3ababad6e5f49f7a9707af922d972d2","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_caf52de29a03408f83ea36218a640e87","IPY_MODEL_c063ed41787247a9a2dc0ea3d72179dd"]}},"b3ababad6e5f49f7a9707af922d972d2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":n
ull,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"caf52de29a03408f83ea36218a640e87":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_e49259feedaf4ad797bfd4415e303d79","_dom_classes":[],"description":"Downloading: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":262620,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":262620,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_765afc722c3c4220ae882a905e8a5b97"}},"c063ed41787247a9a2dc0ea3d72179dd":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_639a03e6264647af9b1e7427f0668bdd","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 263k/263k [00:03&lt;00:00, 
74.3kB/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_802706746a894ce98c57638167c3f0ac"}},"e49259feedaf4ad797bfd4415e303d79":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"765afc722c3c4220ae882a905e8a5b97":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"639a03e6264647af9b1e7427f0668bdd":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"802706746a894ce98c57638167c3f0
ac":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"7699afba90cb4cfc81c11c6f2d2a5863":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_f6106d24193c43f684d7cfe1c4da1e9a","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_3f0adc2278c649a28c6981511c4de863","IPY_MODEL_8650298fba1946a6b92225396a9c1ae6"]}},"f6106d24193c43f684d7cfe1c4da1e9a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":
null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"3f0adc2278c649a28c6981511c4de863":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_71d9c9c705e54acca1772341a56bf62d","_dom_classes":[],"description":"Downloading: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":59,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":59,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_6a808381740d4e66ad9154e0c82cab11"}},"8650298fba1946a6b92225396a9c1ae6":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_60d12a6cecb441e285f1ab655cf2c6c6","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 59.0/59.0 [00:00&lt;00:00, 
600B/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_322f9eb066284012a678840d18109013"}},"71d9c9c705e54acca1772341a56bf62d":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"6a808381740d4e66ad9154e0c82cab11":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"60d12a6cecb441e285f1ab655cf2c6c6":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"322f9eb066284012a678840d18109013
":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"b52766d128134a49a3edc5a96fc00322":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_ac28188373bb48b2a81a6d26e40da5ae","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_9a6012a2fd97403a8bec9b86cd3928d4","IPY_MODEL_b9e2d4bc686941e1afa3eb3f75a5eb47"]}},"ac28188373bb48b2a81a6d26e40da5ae":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":nu
ll,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"9a6012a2fd97403a8bec9b86cd3928d4":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_dabc78fd3b74416ead6970acee34ac93","_dom_classes":[],"description":"Downloading: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":433,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":433,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_22d6aaf6a77b4d21a038ca89df1b0ad9"}},"b9e2d4bc686941e1afa3eb3f75a5eb47":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_7410b5eab09e494c841923e3fa6ed7b2","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 433/433 [00:58&lt;00:00, 
7.34B/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_92bb3df784f04f7eb76f729907e21c95"}},"dabc78fd3b74416ead6970acee34ac93":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"22d6aaf6a77b4d21a038ca89df1b0ad9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"7410b5eab09e494c841923e3fa6ed7b2":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"92bb3df784f04f7eb76f729907e21c9
5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"63cf4749a2084c199994ef5e26d3d4e5":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_d42f138d12d64c0c9531369c0d463969","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_e4f5bcea9d2548a7a42ea23030947f02","IPY_MODEL_67c59c5be0a24ca9b0f2a25e4b1ea004"]}},"d42f138d12d64c0c9531369c0d463969":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":n
ull,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"e4f5bcea9d2548a7a42ea23030947f02":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_0c491960dea845a5b8b8d27d1864f452","_dom_classes":[],"description":"Downloading: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":231508,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":231508,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_32f83e7775374ff5b106a1ae9f3d67ac"}},"67c59c5be0a24ca9b0f2a25e4b1ea004":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_6ea77c223d99429684f0f75f58a35d0e","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 232k/232k [00:05&lt;00:00, 
43.9kB/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_78c89eda010e4e648cd9a29e93a2247a"}},"0c491960dea845a5b8b8d27d1864f452":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"32f83e7775374ff5b106a1ae9f3d67ac":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"6ea77c223d99429684f0f75f58a35d0e":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"78c89eda010e4e648cd9a29e93a224
7a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"a92ddd3a4eb4428f8c94043d65dea375":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_6dcfdc82ac914257ac958bd3857fece6","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_55a850af015742f494a8370c3055d5fa","IPY_MODEL_4a1414ceb6cf46dda62e83011434853d"]}},"6dcfdc82ac914257ac958bd3857fece6":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":
null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"55a850af015742f494a8370c3055d5fa":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_8c9f002f74c74dafbe654c8c37fa0fcc","_dom_classes":[],"description":"Downloading: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":466062,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":466062,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_afe6405c66424f669b278a88a9556acf"}},"4a1414ceb6cf46dda62e83011434853d":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_c54ebda5ef4248b0bf2b380202b77229","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 466k/466k [00:03&lt;00:00, 
137kB/s]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_89160be576ca4c4489da83d2c5f370d5"}},"8c9f002f74c74dafbe654c8c37fa0fcc":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"afe6405c66424f669b278a88a9556acf":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"c54ebda5ef4248b0bf2b380202b77229":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"89160be576ca4c4489da83d2c5f370d
5":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"6b49494c08e848ff822e3935c046d25e":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","state":{"_view_name":"HBoxView","_dom_classes":[],"_model_name":"HBoxModel","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.5.0","box_style":"","layout":"IPY_MODEL_edf23bd21227425ba0680935bff4dfe9","_model_module":"@jupyter-widgets/controls","children":["IPY_MODEL_ca92a8d0f7154943b5471d27d532bfb0","IPY_MODEL_6e374000324142f0ac199b171756bbe5"]}},"edf23bd21227425ba0680935bff4dfe9":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":n
ull,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"ca92a8d0f7154943b5471d27d532bfb0":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","state":{"_view_name":"ProgressView","style":"IPY_MODEL_03d895e36a684ffeb8bbf241e8ffb0c8","_dom_classes":[],"description":"Downloading: 100%","_model_name":"FloatProgressModel","bar_style":"success","max":28,"_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":28,"_view_count":null,"_view_module_version":"1.5.0","orientation":"horizontal","min":0,"description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_76ad803165c840d28ef3c2ce316aed47"}},"6e374000324142f0ac199b171756bbe5":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","state":{"_view_name":"HTMLView","style":"IPY_MODEL_b6932a8f692749fa8b68f7ea6da273ee","_dom_classes":[],"description":"","_model_name":"HTMLModel","placeholder":"​","_view_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","value":" 28.0/28.0 [00:52&lt;00:00, 
1.88s/B]","_view_count":null,"_view_module_version":"1.5.0","description_tooltip":null,"_model_module":"@jupyter-widgets/controls","layout":"IPY_MODEL_a72bb35a6e54416a95bf0dfd8d1044b2"}},"03d895e36a684ffeb8bbf241e8ffb0c8":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","state":{"_view_name":"StyleView","_model_name":"ProgressStyleModel","description_width":"initial","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","bar_color":null,"_model_module":"@jupyter-widgets/controls"}},"76ad803165c840d28ef3c2ce316aed47":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}},"b6932a8f692749fa8b68f7ea6da273ee":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","state":{"_view_name":"StyleView","_model_name":"DescriptionStyleModel","description_width":"","_view_module":"@jupyter-widgets/base","_model_module_version":"1.5.0","_view_count":null,"_view_module_version":"1.2.0","_model_module":"@jupyter-widgets/controls"}},"a72bb35a6e54416a95bf0dfd8d1044b
2":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","state":{"_view_name":"LayoutView","grid_template_rows":null,"right":null,"justify_content":null,"_view_module":"@jupyter-widgets/base","overflow":null,"_model_module_version":"1.2.0","_view_count":null,"flex_flow":null,"width":null,"min_width":null,"border":null,"align_items":null,"bottom":null,"_model_module":"@jupyter-widgets/base","top":null,"grid_column":null,"overflow_y":null,"overflow_x":null,"grid_auto_flow":null,"grid_area":null,"grid_template_columns":null,"flex":null,"_model_name":"LayoutModel","justify_items":null,"grid_row":null,"max_height":null,"align_content":null,"visibility":null,"align_self":null,"height":null,"min_height":null,"padding":null,"grid_auto_rows":null,"grid_gap":null,"max_width":null,"order":null,"_view_module_version":"1.2.0","grid_template_areas":null,"object_position":null,"object_fit":null,"grid_auto_columns":null,"margin":null,"display":null,"left":null}}}}},"cells":[{"cell_type":"markdown","metadata":{"id":"TRDYJW5ljYwK","outputId":"3547ca78-10f2-4790-f6a1-90669785b58b"},"source":["# Tokenization"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"wXomV31vsGue","executionInfo":{"status":"ok","timestamp":1618945812880,"user_tz":-180,"elapsed":8411,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"c9426ada-d515-4081-d0ea-6920c9fb526a"},"source":["!pip install transformers"],"execution_count":2,"outputs":[{"output_type":"stream","text":["Collecting transformers\n","\u001b[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)\n","\u001b[K     |████████████████████████████████| 2.1MB 5.3MB/s \n","\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in 
/usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n","Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n","Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n","Collecting sacremoses\n","\u001b[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n","\u001b[K     |████████████████████████████████| 901kB 34.8MB/s \n","\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n","\u001b[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)\n","\u001b[K     |████████████████████████████████| 3.3MB 37.7MB/s \n","\u001b[?25hRequirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers) (3.10.1)\n","Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.41.1)\n","Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (20.9)\n","Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2020.12.5)\n","Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n","Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n","Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n","Requirement already satisfied: six in 
/usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n","Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n","Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n","Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.7.4.3)\n","Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.4.1)\n","Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n","Installing collected packages: sacremoses, tokenizers, transformers\n","Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"aSOk-ouhr6VV"},"source":["## Loading a Turkish Pre-trained 
Tokenizer"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":204,"referenced_widgets":["818dd77dbc604ecc9325fc498b4601c3","4b00a00f16fa4ce3b6fcff6a5bf1bd19","cfd5307cafa147bd9954022930e7a844","6e6c17ffa90c4745aee961fb48e3f9e4","66ace374eb574853902bc81c61d23f1c","9b37af413d594ab4adf18ff2f1e539c5","2961f7351cf84b3492fff54a4cb0a88a","dce37a91bcdf4fc897df8bb014acda4e","f44e3dab2caf465696cfa4770b8db21b","b3ababad6e5f49f7a9707af922d972d2","caf52de29a03408f83ea36218a640e87","c063ed41787247a9a2dc0ea3d72179dd","e49259feedaf4ad797bfd4415e303d79","765afc722c3c4220ae882a905e8a5b97","639a03e6264647af9b1e7427f0668bdd","802706746a894ce98c57638167c3f0ac","7699afba90cb4cfc81c11c6f2d2a5863","f6106d24193c43f684d7cfe1c4da1e9a","3f0adc2278c649a28c6981511c4de863","8650298fba1946a6b92225396a9c1ae6","71d9c9c705e54acca1772341a56bf62d","6a808381740d4e66ad9154e0c82cab11","60d12a6cecb441e285f1ab655cf2c6c6","322f9eb066284012a678840d18109013"]},"id":"K-4KLgJ5r6VW","executionInfo":{"status":"ok","timestamp":1618945872698,"user_tz":-180,"elapsed":9870,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"6152cdbe-fa9d-4eb8-8aa4-7c8ea931f9bf"},"source":["from transformers import AutoModel, AutoTokenizer\n","tokenizerTUR = AutoTokenizer.from_pretrained(\"dbmdz/bert-base-turkish-uncased\",)\n","print(f\"VOC size is: {tokenizerTUR.vocab_size}\")\n","print(f\"The model is {type(tokenizerTUR)}\")"],"execution_count":3,"outputs":[{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"818dd77dbc604ecc9325fc498b4601c3","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, 
style=ProgressStyle(description_…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"f44e3dab2caf465696cfa4770b8db21b","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Downloading', max=262620.0, style=ProgressStyle(descripti…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"7699afba90cb4cfc81c11c6f2d2a5863","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Downloading', max=59.0, style=ProgressStyle(description_w…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n","VOC size is: 32000\n","The model is <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"S3caioNCr6VY","executionInfo":{"status":"ok","timestamp":1618945877297,"user_tz":-180,"elapsed":1076,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":[""],"execution_count":3,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"aHtaOWRfr6VY"},"source":["## Loading an English Pre-trained 
Tokenizer"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":254,"referenced_widgets":["b52766d128134a49a3edc5a96fc00322","ac28188373bb48b2a81a6d26e40da5ae","9a6012a2fd97403a8bec9b86cd3928d4","b9e2d4bc686941e1afa3eb3f75a5eb47","dabc78fd3b74416ead6970acee34ac93","22d6aaf6a77b4d21a038ca89df1b0ad9","7410b5eab09e494c841923e3fa6ed7b2","92bb3df784f04f7eb76f729907e21c95","63cf4749a2084c199994ef5e26d3d4e5","d42f138d12d64c0c9531369c0d463969","e4f5bcea9d2548a7a42ea23030947f02","67c59c5be0a24ca9b0f2a25e4b1ea004","0c491960dea845a5b8b8d27d1864f452","32f83e7775374ff5b106a1ae9f3d67ac","6ea77c223d99429684f0f75f58a35d0e","78c89eda010e4e648cd9a29e93a2247a","a92ddd3a4eb4428f8c94043d65dea375","6dcfdc82ac914257ac958bd3857fece6","55a850af015742f494a8370c3055d5fa","4a1414ceb6cf46dda62e83011434853d","8c9f002f74c74dafbe654c8c37fa0fcc","afe6405c66424f669b278a88a9556acf","c54ebda5ef4248b0bf2b380202b77229","89160be576ca4c4489da83d2c5f370d5","6b49494c08e848ff822e3935c046d25e","edf23bd21227425ba0680935bff4dfe9","ca92a8d0f7154943b5471d27d532bfb0","6e374000324142f0ac199b171756bbe5","03d895e36a684ffeb8bbf241e8ffb0c8","76ad803165c840d28ef3c2ce316aed47","b6932a8f692749fa8b68f7ea6da273ee","a72bb35a6e54416a95bf0dfd8d1044b2"]},"id":"HBUiytDGr6VY","executionInfo":{"status":"ok","timestamp":1618945888550,"user_tz":-180,"elapsed":9164,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"cd92c08a-0c19-4fb7-88ad-40f8a3ef7a9b"},"source":["from transformers import AutoModel, AutoTokenizer\n","tokenizerEN = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n","print(f\"VOC size is: {tokenizerEN.vocab_size}\")\n","print(f\"The model is 
{type(tokenizerEN)}\")"],"execution_count":4,"outputs":[{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"b52766d128134a49a3edc5a96fc00322","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"63cf4749a2084c199994ef5e26d3d4e5","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"a92ddd3a4eb4428f8c94043d65dea375","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n"],"name":"stdout"},{"output_type":"display_data","data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6b49494c08e848ff822e3935c046d25e","version_minor":0,"version_major":2},"text/plain":["HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…"]},"metadata":{"tags":[]}},{"output_type":"stream","text":["\n","VOC size is: 30522\n","The model is <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"r99VtKxqr6VZ","executionInfo":{"status":"ok","timestamp":1618945893170,"user_tz":-180,"elapsed":1106,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"ebcfce9f-86b7-4756-b676-d924b49dd55c"},"source":["word_en=\"telecommunications\"\n","print(f\"is in Turkish Model ? {word_en in tokenizerTUR.vocab}\")\n","print(f\"is in English Model ? {word_en in tokenizerEN.vocab}\")"],"execution_count":5,"outputs":[{"output_type":"stream","text":["is in Turkish Model ? False\n","is in English Model ? True\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0BcZPXHIr6Va","executionInfo":{"status":"ok","timestamp":1618945895008,"user_tz":-180,"elapsed":758,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"c2f5b2a2-eebd-4737-a269-c2dab09aa73c"},"source":["tokens=tokenizerTUR.tokenize(word_en)\n","tokens"],"execution_count":6,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['tel', '##eco', '##mm', '##un', '##ica', '##tions']"]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"UZLvMW4Vr6Va"},"source":["However, every one of these subword pieces is in the Turkish model's vocabulary:"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-Tjxu8vcr6Vb","executionInfo":{"status":"ok","timestamp":1618945898141,"user_tz":-180,"elapsed":1482,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"6be06f90-a573-47aa-e509-7022d88b7f10"},"source":["[t in tokenizerTUR.vocab for t in tokens]"],"execution_count":7,"outputs":[{"output_type":"execute_result","data":{"text/plain":["[True, True, True, True, True, 
True]"]},"metadata":{"tags":[]},"execution_count":7}]},{"cell_type":"code","metadata":{"id":"nWzifjghr6Vb","executionInfo":{"status":"ok","timestamp":1618945906894,"user_tz":-180,"elapsed":1149,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":[""],"execution_count":7,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"JDO-9FnCr6Vb","executionInfo":{"status":"ok","timestamp":1618945906896,"user_tz":-180,"elapsed":647,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"a4a6e1d1-56c5-458f-8a7d-dd39e2542085"},"source":["tokens= tokenizerEN.tokenize(word_en)\n","tokens"],"execution_count":8,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['telecommunications']"]},"metadata":{"tags":[]},"execution_count":8}]},{"cell_type":"code","metadata":{"id":"NWcEicrrr6Vc","executionInfo":{"status":"ok","timestamp":1618945913635,"user_tz":-180,"elapsed":1204,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["long_word_tur=\"Muvaffakiyetsizleştiricileştiriveremeyebileceklerimizdenmişsinizcesine\""],"execution_count":9,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"mO0ewrDIr6Vc"},"source":["This word means: “As though you happen to have been from among those whom we will not be able to easily/quickly make a maker of unsuccessful ones.”"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"1hibagk2r6Vd","executionInfo":{"status":"ok","timestamp":1618945915728,"user_tz":-180,"elapsed":612,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"8d86dc97-5eb4-4b8f-e01a-5335216d6af4"},"source":["print(tokenizerTUR.tokenize(long_word_tur))"],"execution_count":10,"outputs":[{"output_type":"stream","text":["['muvaffak', '##iyet', '##siz', '##les', '##tir', '##ici', '##les', '##tir', '##iver', '##emeye', '##bilecekleri', '##mi', '##z', '##den', '##mis', '##siniz', '##cesine']\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"D5d08yswr6Vd"},"source":["## Understanding Tokenization Algorithms"]},{"cell_type":"markdown","metadata":{"id":"TpIlpuIgr6Ve"},"source":["### Train tokenizers from scratch"]},{"cell_type":"markdown","metadata":{"id":"zFmJoDDvr6Ve"},"source":["Let's load three Shakespeare plays (Macbeth, Hamlet, and Julius Caesar) from NLTK's Project Gutenberg corpus."]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"N_XZpN-cr6Ve","executionInfo":{"status":"ok","timestamp":1618945944762,"user_tz":-180,"elapsed":4885,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"4f891350-1682-40d1-ba9d-f4be48694e03"},"source":["import nltk \n","from nltk.corpus import gutenberg \n","nltk.download('gutenberg') \n","nltk.download('punkt') \n","plays=['shakespeare-macbeth.txt','shakespeare-hamlet.txt','shakespeare-caesar.txt']\n","shakespeare=[\" \".join(s) for ply in plays for s in gutenberg.sents(ply)]"],"execution_count":11,"outputs":[{"output_type":"stream","text":["[nltk_data] Downloading package gutenberg to /root/nltk_data...\n","[nltk_data]   Unzipping corpora/gutenberg.zip.\n","[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data]   Unzipping tokenizers/punkt.zip.\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"tuQd7uvbkixB"},"source":["# We prepare a template for the post-processing \n","# Some 
initial settings"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"xoUf31Wxr6Vf","executionInfo":{"status":"ok","timestamp":1618945950174,"user_tz":-180,"elapsed":1155,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["from tokenizers.processors import TemplateProcessing\n","special_tokens= [\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"]\n","temp_proc= TemplateProcessing(\n","    single=\"[CLS] $A [SEP]\",\n","    pair=\"[CLS] $A [SEP] $B:1 [SEP]:1\",\n","    special_tokens=[\n","        (\"[CLS]\", special_tokens.index(\"[CLS]\")),\n","        (\"[SEP]\", special_tokens.index(\"[SEP]\")),\n","    ],\n",")"],"execution_count":12,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"fIY49Bv7r6Vf"},"source":["## Training BPE"]},{"cell_type":"code","metadata":{"id":"mkfZRm2Sh0Tz","executionInfo":{"status":"ok","timestamp":1618945953052,"user_tz":-180,"elapsed":1387,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["from tokenizers import Tokenizer\n","from tokenizers.normalizers import (Sequence,Lowercase, NFD, StripAccents)\n","from tokenizers.pre_tokenizers import Whitespace\n","from tokenizers.models import BPE\n","from tokenizers.decoders import BPEDecoder\n","\n","# Instantiate BPE (Byte-Pair Encoding)\n","tokenizer = Tokenizer(BPE())\n","\n","# Normalization pipeline: Unicode NFD decomposition, lowercasing, then accent stripping.\n","# * Sequence: composes multiple normalizers that are run in the given order\n","tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])\n","\n","# Whitespace: Splits on word boundaries using the regular expression \\w+|[^\\w\\s]+ \n","tokenizer.pre_tokenizer = Whitespace() \n","tokenizer.decoder = 
BPEDecoder()\n","tokenizer.post_processor=temp_proc"],"execution_count":13,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1L-0PgEhr6Vg"},"source":["We are ready to train the model "]},{"cell_type":"code","metadata":{"id":"Ikfle7-fizWI","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1618945957592,"user_tz":-180,"elapsed":1426,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"4b7dce04-e217-4152-c92e-d25a96a443d2"},"source":["from tokenizers.trainers import BpeTrainer\n","trainer = BpeTrainer(vocab_size=5000, special_tokens= special_tokens)\n","tokenizer.train_from_iterator(shakespeare, trainer=trainer)\n","print(f\"Trained vocab size: {tokenizer.get_vocab_size()}\" )"],"execution_count":14,"outputs":[{"output_type":"stream","text":["Trained vocab size: 5000\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"aa_s1NhglLRF","executionInfo":{"status":"ok","timestamp":1618945959528,"user_tz":-180,"elapsed":1170,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["# take a sentence from macbeth"],"execution_count":15,"outputs":[]},{"cell_type":"code","metadata":{"id":"cvzv2x7Nr6Vh","executionInfo":{"status":"ok","timestamp":1618945967994,"user_tz":-180,"elapsed":993,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":[""],"execution_count":15,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"uBV4W4eovhr5","executionInfo":{"status":"ok","timestamp":1618945970624,"user_tz":-180,"elapsed":914,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"6d9a2785-4a3d-4562-9234-9eb5b581e988"},"source":["sen= \"Is this a dagger which I see before me, the handle toward my hand?\"\n","sen_enc=tokenizer.encode(sen)\n","print(f\"Output: {format(sen_enc.tokens)}\")"],"execution_count":16,"outputs":[{"output_type":"stream","text":["Output: ['[CLS]', 'is', 'this', 'a', 'dagger', 'which', 'i', 'see', 'before', 'me', ',', 'the', 'hand', 'le', 'toward', 'my', 'hand', '?', '[SEP]']\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"yoYfnvcKpvQJ","executionInfo":{"status":"ok","timestamp":1618945972850,"user_tz":-180,"elapsed":635,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["sen_enc2=tokenizer.encode(\"Macbeth and Hugging Face\")"],"execution_count":17,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"DYVdvb6_p3aa","executionInfo":{"status":"ok","timestamp":1618945974703,"user_tz":-180,"elapsed":877,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"11bfa3b4-c57e-4c2c-e042-655a9aaea470"},"source":["print(f\"Output: {format(sen_enc2.tokens)}\")"],"execution_count":18,"outputs":[{"output_type":"stream","text":["Output: ['[CLS]', 'macbeth', 'and', 'hu', 'gg', 'ing', 'face', '[SEP]']\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"M0LpnYmhqV53","executionInfo":{"status":"ok","timestamp":1618945976893,"user_tz":-180,"elapsed":1026,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["# Let us pass  two 
sentences"],"execution_count":19,"outputs":[]},{"cell_type":"code","metadata":{"id":"ayD7kPfl-uYL","executionInfo":{"status":"ok","timestamp":1618945981947,"user_tz":-180,"elapsed":1306,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["two_enc=tokenizer.encode(\"I like Hugging Face!\",\"He likes Macbeth!\")"],"execution_count":20,"outputs":[]},{"cell_type":"code","metadata":{"id":"QCp2AGet-uaq","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1618945984983,"user_tz":-180,"elapsed":1093,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"8629bcda-c401-4f67-ea6d-06987c1a9364"},"source":["print(f\"Output: {format(two_enc.tokens)}\")"],"execution_count":21,"outputs":[{"output_type":"stream","text":["Output: ['[CLS]', 'i', 'like', 'hu', 'gg', 'ing', 'face', '!', '[SEP]', 'he', 'likes', 'macbeth', '!', '[SEP]']\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"_s5-XzSPrdKi","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1618945988911,"user_tz":-180,"elapsed":1317,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"e9d16a8c-79f2-4528-db6a-26dbb933974b"},"source":["tokenizer.model.save('.')"],"execution_count":22,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['./vocab.json', './merges.txt']"]},"metadata":{"tags":[]},"execution_count":22}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"P74JPOQsrjxi","executionInfo":{"status":"ok","timestamp":1618945991080,"user_tz":-180,"elapsed":1424,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"6ca65a8b-4b18-4a02-b989-422a854ce017"},"source":["!wc -l ./merges.txt"],"execution_count":23,"outputs":[{"output_type":"stream","text":["4948 ./merges.txt\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ipocLFsxsjim","executionInfo":{"status":"ok","timestamp":1618945993313,"user_tz":-180,"elapsed":1692,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"7fdf2ddd-ce2d-4a10-ac37-9c6ebb84481e"},"source":["!head -6 ./merges.txt"],"execution_count":24,"outputs":[{"output_type":"stream","text":["#version: 0.2 - Trained by `huggingface/tokenizers`\n","t h\n","o u\n","a n\n","th e\n","r e\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"p6IDCioesHTd","executionInfo":{"status":"ok","timestamp":1618945994439,"user_tz":-180,"elapsed":1177,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"ddae6d2b-9651-4084-8636-cda12ae6f4f1"},"source":["!head -1000 ./merges.txt| tail -5"],"execution_count":25,"outputs":[{"output_type":"stream","text":["ch ance\n","si g\n","your s\n","ti a\n","po int\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"MgJ_9XbQQqxO","executionInfo":{"status":"ok","timestamp":1618945995788,"user_tz":-180,"elapsed":1418,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["# Save and Load 
Tokenizer"],"execution_count":26,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cJzzz-oTQq2E","executionInfo":{"status":"ok","timestamp":1618945995789,"user_tz":-180,"elapsed":1175,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"b939c3bb-8179-4abc-dcdb-e85891d2826f"},"source":["tokenizer.save(\"MyBPETokenizer.json\")\n","tokenizerFromFile=Tokenizer.from_file(\"MyBPETokenizer.json\")\n","sen_enc3 = tokenizerFromFile.encode(\"I like HuggingFace and Macbeth\")\n","print(f\"Output: {format(sen_enc3.tokens)}\")"],"execution_count":27,"outputs":[{"output_type":"stream","text":["Output: ['[CLS]', 'i', 'like', 'hu', 'gg', 'ing', 'face', 'and', 'macbeth', '[SEP]']\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"7cqr6iYOr6Vm"},"source":["## Training WordPiece"]},{"cell_type":"code","metadata":{"id":"3DZVqwXd37B3","executionInfo":{"status":"ok","timestamp":1618945999913,"user_tz":-180,"elapsed":590,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["from tokenizers.models import WordPiece\n","from tokenizers.decoders import WordPiece as WordPieceDecoder\n","from tokenizers.normalizers import BertNormalizer \n","\n","# The BERT normalizer cleans the text, handles accents and Chinese characters, and lowercases it\n","\n","tokenizer = Tokenizer(WordPiece())\n","tokenizer.normalizer=BertNormalizer()\n","tokenizer.pre_tokenizer = Whitespace()\n","\n","tokenizer.decoder= WordPieceDecoder()"],"execution_count":28,"outputs":[]},{"cell_type":"code","metadata":{"id":"aGo90r1WD8ea","executionInfo":{"status":"ok","timestamp":1618946002333,"user_tz":-180,"elapsed":1216,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":[""],"execution_count":28,"outputs":[]},{"cell_type":"code","metadata":{"id":"zIQrels54Jsn","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1618946003168,"user_tz":-180,"elapsed":1556,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"cabf21cd-45fc-4d6b-cc77-9b66e7addd9a"},"source":["from tokenizers.trainers import WordPieceTrainer\n","trainer = WordPieceTrainer(vocab_size=5000, special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"])\n","\n","tokenizer.train_from_iterator(shakespeare, trainer=trainer)\n","output = tokenizer.encode(sen)\n","print(output.tokens)"],"execution_count":29,"outputs":[{"output_type":"stream","text":["['is', 'this', 'a', 'dagger', 'which', 'i', 'see', 'before', 'me', ',', 'the', 'hand', '##le', 'toward', 'my', 'hand', '?']\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"09-gAIpEr6Vo","executionInfo":{"status":"ok","timestamp":1618946005594,"user_tz":-180,"elapsed":1677,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["# Let us use the WordPiece decoder to merge the '##' sub-word pieces back into a readable sentence."],"execution_count":30,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":35},"id":"kQ4zF1nN_4MZ","executionInfo":{"status":"ok","timestamp":1618946006009,"user_tz":-180,"elapsed":1600,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"3bb94c1c-7f61-4b0e-d519-3c22e6ac5b4a"},"source":["tokenizer.decode(output.ids)"],"execution_count":31,"outputs":[{"output_type":"execute_result","data":{"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"},"text/plain":["'is this a dagger which i see before me, the handle toward my hand?'"]},"metadata":{"tags":[]},"execution_count":31}]},{"cell_type":"code","metadata":{"id":"VNrqDN3zANUN","executionInfo":{"status":"ok","timestamp":1618946006010,"user_tz":-180,"elapsed":1259,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["# Force the model to produce [UNK] tokens by encoding words it cannot tokenize (Turkish input)"],"execution_count":32,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"sdTfTeXYAP9b","executionInfo":{"status":"ok","timestamp":1618946007558,"user_tz":-180,"elapsed":671,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"18135161-c689-4168-9d7e-fd8a1b89f877"},"source":["tokenizer.encode(\"Kralsın aslansın Macbeth!\").tokens"],"execution_count":33,"outputs":[{"output_type":"execute_result","data":{"text/plain":["['[UNK]', '[UNK]', 'macbeth', '!']"]},"metadata":{"tags":[]},"execution_count":33}]},{"cell_type":"code","metadata":{"id":"7nZnpYDJr6Vp","executionInfo":{"status":"ok","timestamp":1618946022807,"user_tz":-180,"elapsed":754,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":[""],"execution_count":33,"outputs":[]},{"cell_type":"code","metadata":{"id":"S9PfRrPOtzwq"},"source":[""],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"5pv1elLZr6Vq"},"source":["# Pre-made tokenizers\n","* CharBPETokenizer: The original BPE\n","* ByteLevelBPETokenizer: The byte-level version of BPE\n","* SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece\n","* BertWordPieceTokenizer: The famous BERT tokenizer, using WordPiece"]},{"cell_type":"code","metadata":{"id":"SIl6CjuPr6Vq","executionInfo":{"status":"ok","timestamp":1618946024250,"user_tz":-180,"elapsed":1127,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["# Fast Tokenizers optimized for Research and Production"],"execution_count":34,"outputs":[]},{"cell_type":"code","metadata":{"id":"HbpJJDbwr6Vq","executionInfo":{"status":"ok","timestamp":1618946029232,"user_tz":-180,"elapsed":1083,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":["from tokenizers import (ByteLevelBPETokenizer,\n","                            CharBPETokenizer,\n","                            SentencePieceBPETokenizer,\n","                            BertWordPieceTokenizer)"],"execution_count":35,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ztQLBl-ar6Vq","executionInfo":{"status":"ok","timestamp":1618946030990,"user_tz":-180,"elapsed":1011,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"e5c4890e-ec9d-4a49-cd92-64ef32b9e8b7"},"source":["tokenizer= SentencePieceBPETokenizer()\n","print(tokenizer.normalizer)\n","print(tokenizer.pre_tokenizer)\n","print(tokenizer.decoder)\n","print(tokenizer.post_processor)"],"execution_count":36,"outputs":[{"output_type":"stream","text":["<tokenizers.normalizers.NFKC object at 0x7f800f4b38b0>\n","<tokenizers.pre_tokenizers.Metaspace object at 0x7f800f4b3d30>\n","<tokenizers.decoders.Metaspace object at 0x7f800f4f3f30>\n","None\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"9rd2e74vr6Vr"},"source":[""]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"-iroWw9Gr6Vr","executionInfo":{"status":"ok","timestamp":1618946035503,"user_tz":-180,"elapsed":1001,"user":{"displayName":"Savas Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}},"outputId":"8c9467bb-d1c7-4235-d197-579132d77f1c"},"source":["tokenizer= BertWordPieceTokenizer()\n","print(tokenizer.normalizer)\n","print(tokenizer.pre_tokenizer)\n","print(tokenizer.decoder)\n","print(tokenizer.post_processor)"],"execution_count":37,"outputs":[{"output_type":"stream","text":["<tokenizers.normalizers.BertNormalizer object at 0x7f800f521770>\n","<tokenizers.pre_tokenizers.BertPreTokenizer object at 0x7f800f4af4b0>\n","<tokenizers.decoders.WordPiece object at 0x7f800f7e3b70>\n","None\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"gZppvrcpr6Vr","executionInfo":{"status":"ok","timestamp":1618946046457,"user_tz":-180,"elapsed":982,"user":{"displayName":"Savas 
Yıldırım","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhdhYZMfq-hvK2xI7HqkzvJuCbfgFrIs4wypQEm5w=s64","userId":"10717726124681851716"}}},"source":[""],"execution_count":38,"outputs":[]},{"cell_type":"code","metadata":{"id":"HwmFUmdcr6Vs"},"source":[""],"execution_count":null,"outputs":[]}]}
